set.seed(2021)
library(MouseGastrulationData)
library(ggplot2)
library(reshape)
library(org.Mm.eg.db)
Load functions
source("../scripts/StabMap_functions.R")
Sample 29, corresponding to E8.5 timepoint. And cell type colours for plotting.
SCE <- EmbryoAtlasData(samples = 29)
SCE
## class: SingleCellExperiment
## dim: 29452 7569
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(7569): cell_95727 cell_95728 ... cell_103294 cell_103295
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
celltype_colours = MouseGastrulationData::EmbryoCelltypeColours
celltype_colours
## Epiblast Primitive Streak
## "#635547" "#DABE99"
## Caudal epiblast PGC
## "#9E6762" "#FACB12"
## Anterior Primitive Streak Notochord
## "#C19F70" "#0F4A9C"
## Def. endoderm Gut
## "#F397C0" "#EF5A9D"
## Nascent mesoderm Mixed mesoderm
## "#C594BF" "#DFCDE4"
## Intermediate mesoderm Caudal Mesoderm
## "#139992" "#3F84AA"
## Paraxial mesoderm Somitic mesoderm
## "#8DB5CE" "#005579"
## Pharyngeal mesoderm Cardiomyocytes
## "#C9EBFB" "#B51D8D"
## Allantois ExE mesoderm
## "#532C8A" "#8870AD"
## Mesenchyme Haematoendothelial progenitors
## "#CC7818" "#FBBE92"
## Endothelium Blood progenitors 1
## "#FF891C" "#F9DECF"
## Blood progenitors 2 Erythroid1
## "#C9A997" "#C72228"
## Erythroid2 Erythroid3
## "#F79083" "#EF4E22"
## NMP Rostral neurectoderm
## "#8EC792" "#65A83E"
## Caudal neurectoderm Neural crest
## "#354E23" "#C3C388"
## Forebrain/Midbrain/Hindbrain Spinal cord
## "#647A4F" "#CDE088"
## Surface ectoderm Visceral endoderm
## "#F7F79E" "#F6BFCB"
## ExE endoderm ExE ectoderm
## "#7F6874" "#989898"
## Parietal endoderm
## "#1A1A1A"
random_500 = sample(rownames(SCE), 500)
random_1000 = sample(rownames(SCE), 1000)
random_2000 = sample(rownames(SCE), 2000)
random_5000 = sample(rownames(SCE), 5000)
This will be re-used multiple times. Note this is a dense matrix, as output from igraph::similarity.
full_sim = generateSimilarity(SCE)
dim(full_sim)
## [1] 7569 7569
full_sim[1:5,1:5]
## cell_95727 cell_95728 cell_95729 cell_95730 cell_95731
## cell_95727 1 0 0 0 0
## cell_95728 0 1 0 0 0
## cell_95729 0 0 1 0 0
## cell_95730 0 0 0 1 0
## cell_95731 0 0 0 0 1
If plot = TRUE then a UMAP embedding is generated among the subset features. I can add an additional UMAP scatterplot with more information using the plotAdditional argument, which is itself a list of the proto objects to add, here is colouring the cells based on their cell type, with additional ggplot style arguments passed in the second object within the list.
plotAdditional = list("celltype",
list(scale_colour_manual(values = celltype_colours),
theme(legend.position = "bottom")))
scores_500 = getSubsetUncertainty(SCE,
subsetGenes = random_500,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
scores_1000 = getSubsetUncertainty(SCE,
subsetGenes = random_1000,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
scores_2000 = getSubsetUncertainty(SCE,
subsetGenes = random_2000,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
scores_5000 = getSubsetUncertainty(SCE,
subsetGenes = random_5000,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
scores_df = data.frame(cell = colnames(SCE),
score_500 = scores_500,
score_1000 = scores_1000,
score_2000 = scores_2000,
score_5000 = scores_5000,
celltype = SCE$celltype)
As the number of random genes increases, the uncertainty score tends to decrease.
ggplot(scores_df, aes(x = score_500, y = score_1000)) +
geom_point(aes(colour = celltype)) +
theme_classic() +
geom_abline(slope = 1, intercept = 0) +
scale_colour_manual(values = celltype_colours) +
NULL
ggplot(scores_df, aes(x = score_1000, y = score_2000)) +
geom_point(aes(colour = celltype)) +
theme_classic() +
geom_abline(slope = 1, intercept = 0) +
scale_colour_manual(values = celltype_colours) +
NULL
ggplot(scores_df, aes(x = score_2000, y = score_5000)) +
geom_point(aes(colour = celltype)) +
theme_classic() +
geom_abline(slope = 1, intercept = 0) +
scale_colour_manual(values = celltype_colours) +
NULL
The decrease in uncertainty score is varied between cell types, e.g. for Gut this is quite pronounced, while not so much for the Erythroid types.
scores_df_long = melt(scores_df, id.vars = c("cell", "celltype"))
ggplot(scores_df_long, aes(x = interaction(variable, celltype), y = value)) +
geom_boxplot(aes(fill = celltype)) +
theme_classic() +
scale_fill_manual(values = celltype_colours) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("") +
theme(legend.position = "bottom") +
NULL
I can also estimate the uncertainty for new data, given the subset. Here, I extract another sample from the MouseGastrulationData package, sample 36, corresponding to again a E8.5 timepoint. For this sample I restrict to genes that are related to cell cycle.
cycle.anno <- select(org.Mm.eg.db, keytype="GOALL", keys="GO:0007049",
columns="ENSEMBL")[,"ENSEMBL"]
cellCycleGenes = intersect(cycle.anno, rownames(SCE))
head(cellCycleGenes)
## [1] "ENSMUSG00000026842" "ENSMUSG00000029580" "ENSMUSG00000026836"
## [4] "ENSMUSG00000000532" "ENSMUSG00000052593" "ENSMUSG00000022893"
length(cellCycleGenes)
## [1] 1635
SCE_query <- EmbryoAtlasData(samples = 36)
SCE_query
## class: SingleCellExperiment
## dim: 29452 4915
## metadata(0):
## assays(1): counts
## rownames(29452): ENSMUSG00000051951 ENSMUSG00000089699 ...
## ENSMUSG00000096730 ENSMUSG00000095742
## rowData names(2): ENSEMBL SYMBOL
## colnames(4915): cell_130406 cell_130407 ... cell_135319 cell_135320
## colData names(17): cell barcode ... colour sizeFactor
## reducedDimNames(2): pca.corrected umap
## altExpNames(0):
Calculate uncertainty scores for both datasets, using the cell cycle genes. This includes a call to cbind of the SingleCellExperiment objects, which is not trivial and should be used with care.
scores_CC = getSubsetUncertainty(SCE,
querySCE = SCE_query,
subsetGenes = cellCycleGenes,
full_sim = full_sim,
plot = TRUE,
plotAdditional = plotAdditional)
length(scores_CC)
## [1] 12484
head(scores_CC)
## cell_95727 cell_95728 cell_95729 cell_95730 cell_95731 cell_95732
## 0.1600000 0.6163265 0.1542857 0.5746939 0.6481633 0.6800000
scores_cc = data.frame(cell = names(scores_CC),
score = scores_CC,
celltype = cbind(SCE, SCE_query)[,names(scores_CC)]$celltype,
sample = cbind(SCE, SCE_query)[,names(scores_CC)]$sample)
ggplot(scores_cc, aes(x = interaction(sample, celltype), y = score)) +
geom_boxplot(aes(fill = celltype)) +
theme_classic() +
scale_fill_manual(values = celltype_colours) +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
xlab("") +
theme(legend.position = "bottom") +
NULL
sessionInfo()
## R version 4.0.4 (2021-02-15)
## Platform: x86_64-apple-darwin19.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
##
## Matrix products: default
## BLAS: /usr/local/Cellar/openblas/0.3.13/lib/libopenblasp-r0.3.13.dylib
## LAPACK: /usr/local/Cellar/r/4.0.4/lib/R/lib/libRlapack.dylib
##
## locale:
## [1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] patchwork_1.1.1 BiocNeighbors_1.8.2
## [3] scuttle_1.0.4 scater_1.18.3
## [5] igraph_1.2.6 bluster_1.0.0
## [7] scran_1.18.3 org.Mm.eg.db_3.12.0
## [9] AnnotationDbi_1.52.0 reshape_0.8.8
## [11] ggplot2_3.3.3 MouseGastrulationData_1.4.0
## [13] SingleCellExperiment_1.12.0 SummarizedExperiment_1.20.0
## [15] Biobase_2.50.0 GenomicRanges_1.42.0
## [17] GenomeInfoDb_1.26.2 IRanges_2.24.1
## [19] S4Vectors_0.28.1 BiocGenerics_0.36.0
## [21] MatrixGenerics_1.2.0 matrixStats_0.57.0
##
## loaded via a namespace (and not attached):
## [1] bitops_1.0-6 bit64_4.0.5
## [3] RcppAnnoy_0.0.18 httr_1.4.2
## [5] tools_4.0.4 R6_2.5.0
## [7] irlba_2.3.3 vipor_0.4.5
## [9] uwot_0.1.10 DBI_1.1.1
## [11] colorspace_2.0-0 withr_2.4.1
## [13] gridExtra_2.3 tidyselect_1.1.0
## [15] bit_4.0.4 curl_4.3
## [17] compiler_4.0.4 DelayedArray_0.16.1
## [19] labeling_0.4.2 scales_1.1.1
## [21] rappdirs_0.3.3 stringr_1.4.0
## [23] digest_0.6.27 rmarkdown_2.6
## [25] XVector_0.30.0 pkgconfig_2.0.3
## [27] htmltools_0.5.1.1 sparseMatrixStats_1.2.0
## [29] dbplyr_2.1.0 fastmap_1.1.0
## [31] limma_3.46.0 rlang_0.4.10
## [33] RSQLite_2.2.3 shiny_1.6.0
## [35] DelayedMatrixStats_1.12.2 farver_2.0.3
## [37] generics_0.1.0 BiocParallel_1.24.1
## [39] dplyr_1.0.3 RCurl_1.98-1.2
## [41] magrittr_2.0.1 BiocSingular_1.6.0
## [43] GenomeInfoDbData_1.2.4 Matrix_1.3-2
## [45] ggbeeswarm_0.6.0 Rcpp_1.0.6
## [47] munsell_0.5.0 viridis_0.5.1
## [49] lifecycle_0.2.0 stringi_1.5.3
## [51] yaml_2.2.1 edgeR_3.32.1
## [53] zlibbioc_1.36.0 plyr_1.8.6
## [55] BiocFileCache_1.14.0 AnnotationHub_2.22.0
## [57] grid_4.0.4 blob_1.2.1
## [59] promises_1.1.1 dqrng_0.2.1
## [61] ExperimentHub_1.16.0 crayon_1.3.4
## [63] lattice_0.20-41 cowplot_1.1.1
## [65] beachmat_2.6.4 locfit_1.5-9.4
## [67] knitr_1.30 pillar_1.4.7
## [69] codetools_0.2-18 glue_1.4.2
## [71] BiocVersion_3.12.0 evaluate_0.14
## [73] BiocManager_1.30.10 vctrs_0.3.6
## [75] httpuv_1.5.5 gtable_0.3.0
## [77] purrr_0.3.4 assertthat_0.2.1
## [79] cachem_1.0.1 xfun_0.20
## [81] rsvd_1.0.3 mime_0.9
## [83] xtable_1.8-4 RSpectra_0.16-0
## [85] later_1.1.0.1 viridisLite_0.3.0
## [87] tibble_3.0.5 beeswarm_0.2.3
## [89] memoise_2.0.0 statmod_1.4.35
## [91] ellipsis_0.3.1 interactiveDisplayBase_1.28.0